banner

Predict the popularity of board games on BoardGameGeek.com

Kaggle link

Twitch link

Load libraries

library(tidyverse)
library(tidymodels)
library(scales)
library(skimr)
library(kableExtra)

# colours
#pal_jo <- c(viridisLite::magma(8)[2:8], "#4C4C53", "#9B9BA8")

pal_sliced <- c(
  '#e946a5', # pink
  '#68e199', # green
  '#302df0', # blue
  '#6a45b0', # purple
  '#129875', # darker green
  '#4f94bf', # lighter blue
  '#b545ab', # darker pink
  '#000000', # black,
  '#cccccc'  #grey
)

title_colour <- pal_sliced[1]
table_colour <- pal_sliced[4]
bar_colour <- pal_sliced[2]
grid_colour <- pal_sliced[9]

theme_set(theme_bw() %+replace%
    theme(
      # align title and caption to the plot not the panel
      plot.title.position = 'plot',
      plot.caption.position = 'plot',
      # change the title and caption to markdown and move them futher from the plot
      plot.title = element_text(
        size = rel(1.3),
        hjust = 0, 
        margin = margin(c(0, 0, 10, 0)),
        colour = title_colour
      ),
      plot.subtitle = element_text(
        size = rel(1.15),
        hjust = 0, 
        margin = margin(c(0, 0, 15, 0))
      ),
      plot.caption = element_text(
        hjust = 1, 
        margin = margin(c(10, 0, 0, 0))
      ),
      # move axis titles to the left/top and change them to markdown
      axis.title = element_text(hjust = 1),
      # allow the axis values to the markdown as well
      axis.text = element_text(),
      # remove the panel border
      panel.border = element_blank(),
      # put in the axis lines with a slightly thicker line than the gridlines
      axis.line = element_line(colour = grid_colour, size = rel(1.5)),
      # make the tickmarks the same colour
      axis.ticks = element_line(colour = grid_colour),
      # facet strip text left aligned with extra space above
      strip.text = element_text(
        hjust = 0, margin = margin(c(10, 0, 0, 0)), colour = title_colour
      ),
      # clear colour and fill for strip
      strip.background = element_rect(colour = NA, fill = NA),
      # dotted gridlines
      panel.grid = element_line(linetype = 'dotted'),
      # ability to use a different colour for the gridlines
      panel.grid.major.x = element_line(colour = grid_colour),
      panel.grid.major.y = element_line(colour = grid_colour),
      panel.grid.minor.x = element_blank(),
      panel.grid.minor.y = element_blank(),
    )
)

scale_y_pct <- function(
  accuracy = 1L, 
  breaks = pretty_breaks(),
  expand = expansion(mult = c(0, .05)),
  ...
) {
  scale_y_continuous(
    labels = scales::percent_format(accuracy = accuracy, big.mark = ","),
    breaks = breaks,
    expand = expand,
    ...
  )
}

scale_y_comma <- function(
  accuracy = 1L, 
  breaks = pretty_breaks(),
  expand = expansion(mult = c(0, .05)),
  ...
) {
  scale_y_continuous(
    labels = scales::comma_format(accuracy = accuracy),
    breaks = breaks,
    expand = expand,
    ...
  )
}

scale_fill_jo <- function(...) {
  scale_fill_manual(values = pal_sliced, ...)
}

scale_fill_discrete <- scale_fill_jo

update_geom_defaults("bar", list(fill = bar_colour))
update_geom_defaults("col", list(fill = bar_colour))
update_geom_defaults("point", list(colour = bar_colour))
update_geom_defaults("line", list(colour = bar_colour))

Read files

to_build <- read_csv("s01e01/train.csv", guess_max = 200000) 

── Column specification ────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  game_id = col_double(),
  min_players = col_double(),
  max_players = col_double(),
  avg_time = col_double(),
  min_time = col_double(),
  max_time = col_double(),
  year = col_double(),
  geek_rating = col_double(),
  num_votes = col_double(),
  age = col_double(),
  owned = col_double()
)
ℹ Use `spec()` for the full column specifications.
to_score <- read_csv("s01e01/test.csv", guess_max = 200000) %>%
  mutate(
    category11 = as.character(category11),
    category12 = as.character(category12)
  )

── Column specification ────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  game_id = col_double(),
  min_players = col_double(),
  max_players = col_double(),
  avg_time = col_double(),
  min_time = col_double(),
  max_time = col_double(),
  year = col_double(),
  num_votes = col_double(),
  age = col_double(),
  owned = col_double(),
  category11 = col_logical(),
  category12 = col_logical()
)
ℹ Use `spec()` for the full column specifications.

Examine data

skim(to_build)
Data summary
Name to_build
Number of rows 3499
Number of columns 26
_______________________
Column type frequency:
character 15
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
names 0 1.00 1 82 0 3485 0
mechanic 0 1.00 4 258 0 1758 0
category1 0 1.00 4 26 0 78 0
category2 611 0.83 4 26 0 81 0
category3 1773 0.49 4 26 0 72 0
category4 2636 0.25 4 25 0 62 0
category5 3098 0.11 4 25 0 45 0
category6 3363 0.04 4 25 0 38 0
category7 3453 0.01 4 25 0 21 0
category8 3480 0.01 5 19 0 14 0
category9 3494 0.00 7 19 0 5 0
category10 3495 0.00 6 25 0 3 0
category11 3498 0.00 9 9 0 1 0
category12 3498 0.00 15 15 0 1 0
designer 0 1.00 4 157 0 1905 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
game_id 0 1 89632.37 77040.84 2.00 11164.50 73538.0 160677.0 244522.0 ▇▂▃▃▂
min_players 0 1 2.01 0.67 0.00 2.00 2.0 2.0 8.0 ▂▇▁▁▁
max_players 0 1 5.06 7.24 0.00 4.00 4.0 6.0 200.0 ▇▁▁▁▁
avg_time 0 1 117.24 487.78 0.00 30.00 60.0 120.0 22500.0 ▇▁▁▁▁
min_time 0 1 82.53 214.19 0.00 30.00 45.0 90.0 5400.0 ▇▁▁▁▁
max_time 0 1 116.80 487.84 0.00 30.00 60.0 120.0 22500.0 ▇▁▁▁▁
year 0 1 1996.10 161.95 -3000.00 2003.00 2011.0 2015.0 2018.0 ▁▁▁▁▇
geek_rating 0 1 6.09 0.48 5.64 5.73 5.9 6.3 8.5 ▇▂▁▁▁
num_votes 0 1 2006.03 4644.62 62.00 281.00 618.0 1640.0 77423.0 ▇▁▁▁▁
age 0 1 10.43 3.22 0.00 8.00 11.0 12.0 42.0 ▃▇▁▁▁
owned 0 1 3054.96 6369.31 49.00 622.50 1204.0 2723.0 111807.0 ▇▁▁▁▁

Target - geek rating

summary(to_build$geek_rating)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  5.640   5.726   5.905   6.088   6.303   8.502 
ggplot(to_build, aes(geek_rating)) +
  geom_histogram(binwidth = 0.2) +
  scale_y_comma() +
  labs(title = 'Geek rating', x = NULL, y = NULL, fill = 'Rating > 7')

Score over 7 looks good

Number of players

  • if zero reset to missing and impute
  • if over 20 reset to 20
summary(to_build$min_players)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   2.000   2.000   2.015   2.000   8.000 
ggplot(to_build, aes(min_players, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 1) +
  scale_y_comma() +
  labs(title = 'Min players', x = NULL, y = NULL, fill = 'Rating > 7')

ggplot(to_build, aes(min_players, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 1, position = 'fill') +
  scale_y_pct() +
  labs(title = 'Min players', x = NULL, y = NULL, fill = 'Rating > 7')

summary(to_build$max_players)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   4.000   4.000   5.061   6.000 200.000 
ggplot(to_build, aes(max_players, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 1) +
  xlim(0, 20) +
  scale_y_comma() +
  labs(title = 'Max players', x = NULL, y = NULL, fill = 'Rating > 7')

ggplot(to_build, aes(max_players, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 1, position = 'fill') +
  xlim(0, 20) +
  scale_y_pct() +
  labs(title = 'Max players', x = NULL, y = NULL, fill = 'Rating > 7')

Year

  • if zero then impute
  • otherwise if < 1900 then set to 1900
summary(to_build$year)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  -3000    2003    2011    1996    2015    2018 
ggplot(to_build, aes(year, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 10) +
  scale_y_comma() +
  labs(title = 'Year', x = NULL, y = NULL, fill = 'Rating > 7')

to_build %>%
  filter(year < 0) %>%
  arrange(year) %>%
  select(names, year) %>%
  kable( 
    escape = FALSE,
    padding = 5
  ) %>% 
  kable_styling(
    bootstrap_options = c("striped", "hover"), 
    fixed_thead = TRUE, 
    position = "left"
  ) %>%
  row_spec(0, color = "white", background = table_colour)
names year
Backgammon -3000
Go -2200
to_build %>%
  filter(year >= 0 & year < 1900) %>%
  count(year) %>%
  kable( 
    escape = FALSE,
    padding = 5
  ) %>% 
  kable_styling(
    bootstrap_options = c("striped", "hover"), 
    fixed_thead = TRUE, 
    position = "left"
  ) %>%
  row_spec(0, color = "white", background = table_colour)
year n
0 10
400 1
550 1
700 1
1000 1
1425 1
1600 2
1663 1
1701 1
1715 1
1780 1
1800 2
1810 1
1848 1
1850 1
1870 1
1876 1
1883 1
1895 1
summary(to_build$year[to_build$year > 1900])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1904    2004    2011    2007    2015    2018 
ggplot(to_build, aes(year, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 10)  +
  xlim(1900, 2020) +
  scale_y_comma() +
  labs(title = 'Year', x = NULL, y = NULL, fill = 'Rating > 7')

ggplot(to_build, aes(year, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 10, position = 'fill')  +
  xlim(1900, 2020) +
  scale_y_pct() +
  labs(title = 'Year', x = NULL, y = NULL, fill = 'Rating > 7')

Time

  • if zero reset to missing and impute
  • if > 240 then set to 240
summary(to_build$avg_time)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    30.0    60.0   117.2   120.0 22500.0 
ggplot(to_build, aes(avg_time, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 15) +
  xlim(0, 240) +
  scale_y_comma() +
  labs(title = 'Average Time', x = NULL, y = NULL, fill = 'Rating > 7')

ggplot(to_build, aes(avg_time, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 15, position = 'fill') +
  xlim(0, 240) +
  scale_y_pct() +
  labs(title = 'Average Time', x = NULL, y = NULL, fill = 'Rating > 7')  

summary(to_build$min_time)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   30.00   45.00   82.53   90.00 5400.00 
ggplot(to_build, aes(min_time, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 15) +
  xlim(0, 240) +
  scale_y_comma() +
  labs(title = 'Min Time', x = NULL, y = NULL, fill = 'Rating > 7')

ggplot(to_build, aes(min_time, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 15, position = 'fill') +
  xlim(0, 240) +
  scale_y_pct() +
  labs(title = 'Min Time', x = NULL, y = NULL, fill = 'Rating > 7')

summary(to_build$max_time)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    30.0    60.0   116.8   120.0 22500.0 
ggplot(to_build, aes(max_time, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 15) +
  xlim(0, 240) +
  scale_y_comma() +
  labs(title = 'Max Time', x = NULL, y = NULL, fill = 'Rating > 7')

ggplot(to_build, aes(max_time, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 15, position = 'fill') +
  xlim(0, 240) +
  scale_y_pct() +
  labs(title = 'Max Time', x = NULL, y = NULL, fill = 'Rating > 7')

Number of votes

  • log transform
summary(to_build$num_votes)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     62     281     618    2006    1640   77423 
ggplot(to_build, aes(num_votes, fill = geek_rating >= 7)) +
  geom_histogram(bins = 50) +
  scale_x_log10(label = comma_format()) +
  scale_y_comma() +
  labs(title = 'Number of votes', x = NULL, y = NULL, fill = 'Rating > 7')

ggplot(to_build, aes(num_votes, fill = geek_rating >= 7)) +
  geom_histogram(bins = 50, position = 'fill') +
  scale_x_log10(label = comma_format()) +
  scale_y_pct() +
  labs(title = 'Number of votes', x = NULL, y = NULL, fill = 'Rating > 7')

Owned

  • log transform
summary(to_build$owned)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    49.0    622.5   1204.0   3055.0   2723.0 111807.0 
ggplot(to_build, aes(owned, fill = geek_rating >= 7)) +
  geom_histogram(bins = 50) +
  scale_x_log10(label = comma_format()) +
  scale_y_comma() +
  labs(title = 'Owned', x = NULL, y = NULL, fill = 'Rating > 7')

ggplot(to_build, aes(owned, fill = geek_rating >= 7)) +
  geom_histogram(bins = 50, position = 'fill') +
  scale_x_log10(label = comma_format()) +
  scale_y_pct() +
  labs(title = 'Owned', x = NULL, y = NULL, fill = 'Rating > 7')

Age

  • looks okay
summary(to_build$age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    8.00   11.00   10.43   12.00   42.00 
ggplot(to_build, aes(age, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 1) +
  xlim(0, 20) +
  scale_y_comma() +
  labs(title = 'Age', x = NULL, y = NULL, fill = 'Rating > 7')

ggplot(to_build, aes(age, fill = geek_rating >= 7)) +
  geom_histogram(binwidth = 1, position = 'fill')  +
  xlim(0, 20) +
  scale_y_pct() +
  labs(title = 'Age', x = NULL, y = NULL, fill = 'Rating > 7')

Common categories

  • flags where 5% +
top_cat <- to_build %>%
  select(starts_with('cate')) %>%
  gather(key = 'key', value = 'value') %>%
  filter(! is.na(value)) %>%
  count(value, sort = TRUE) %>%
  #slice(1:20) %>%
  select(value, n) %>%
  mutate(pct = n /  nrow(to_build)) %>%
  filter(pct > 0.05)

ggplot(top_cat, aes(fct_reorder(value, n), pct)) +
  geom_col() +
  coord_flip() +
  scale_y_pct() +
  labs(title = 'Top categories', x = NULL, y = NULL, fill = 'Rating > 7')

Common mechanic

  • flags where 5% +
top_mechanic <- to_build %>%
  select(mechanic) %>%
  mutate(mechanic = str_split(mechanic, ',')) %>%
  unnest(mechanic) %>%
  mutate(mechanic = str_trim(mechanic)) %>%
  count(mechanic, sort = TRUE) %>%
  #slice(1:20) %>%
  select(mechanic, n) %>%
  mutate(pct = n /  nrow(to_build)) %>%
  filter(pct > 0.05)

ggplot(top_mechanic, aes(fct_reorder(mechanic, n), pct)) +
  geom_col() +
  coord_flip() +
  scale_y_pct() +
  labs(title = 'Top mechanics', x = NULL, y = NULL, fill = 'Rating > 7')

Designer

top_designer <- to_build %>%
  select(designer)%>%
  mutate(designer = str_split(designer, ',')) %>%
  unnest(designer) %>%
  mutate(designer = str_trim(designer)) %>%
  count(designer, sort = TRUE) %>%
  filter(! designer %in% c('(Uncredited)', 'Jr.')) %>%
  slice(1:10) %>%
  select(designer, n) %>%
  mutate(pct = n /  nrow(to_build))

ggplot(top_designer, aes(fct_reorder(designer, n), pct)) +
  geom_col() +
  coord_flip()  +
  scale_y_pct(accuracy = 0.1) +
  labs(title = 'Top designers', x = NULL, y = NULL, fill = 'Rating > 7')

Split in to train and test

set.seed(539)

split <- initial_split(to_build, strata = geek_rating)
train <- training(split)
test <- testing(split)

Create recipe

rec <- recipe(geek_rating ~ ., data = train) %>%
  # id role for game_id
  update_role(game_id, new_role = "ID") %>%
  # flags for categories
  step_mutate(
    
    cat_card_game = if_any(starts_with("cate"), ~ .x == "Card Game"),
    cat_card_game =as.numeric( replace_na(cat_card_game, FALSE)),
    
    cat_wargame = if_any(starts_with("cate"), ~ .x == "Wargame"),
    cat_wargame = as.numeric(replace_na(cat_wargame, FALSE)),
    
    cat_fantasy = if_any(starts_with("cate"), ~ .x == "Fantasy"),
    cat_fantasy = as.numeric(replace_na(cat_fantasy, FALSE)),
    
    cat_economic = if_any(starts_with("cate"), ~ .x == "Economic"),
    cat_economic = as.numeric(replace_na(cat_economic, FALSE)),
    
    cat_fighting = if_any(starts_with("cate"), ~ .x == "Fighting"),
    cat_fighting = as.numeric(replace_na(cat_fighting, FALSE)),
    
    cat_science_fiction = if_any(starts_with("cate"), ~ .x == 
      "Science Fiction"),
    cat_science_fiction = as.numeric(replace_na(cat_science_fiction, FALSE)),
    
    cat_dice = if_any(starts_with("cate"), ~ .x == "Dice"),
    cat_dice = as.numeric(replace_na(cat_dice, FALSE)),
    
    cat_bluffing = if_any(starts_with("cate"), ~ .x == "Bluffing"),
    cat_bluffing = as.numeric(replace_na(cat_bluffing, FALSE)),
    
    cat_adventure = if_any(starts_with("cate"), ~ .x == "Adventure"),
    cat_adventure = as.numeric(replace_na(cat_adventure, FALSE)),
    
    cat_miniatures = if_any(starts_with("cate"), ~ .x == "Miniatures"),
    cat_miniatures = as.numeric(replace_na(cat_miniatures, FALSE)),
    
    cat_ww2 = if_any(starts_with("cate"), ~ .x == "World War II"),
    cat_ww2 = as.numeric(replace_na(cat_ww2, FALSE)),
    
    cat_medieval = if_any(starts_with("cate"), ~ .x == "Medieval"),
    cat_medieval = as.numeric(replace_na(cat_medieval, FALSE)),
    
    cat_explor = if_any(starts_with("cate"), ~ .x == "Exploration"),
    cat_explor = as.numeric(replace_na(cat_explor, FALSE)),
    
    cat_deduct = if_any(starts_with("cate"), ~ .x == "Deduction"),
    cat_deduct = as.numeric(replace_na(cat_deduct, FALSE)),
    
    cat_party = if_any(starts_with("cate"), ~ .x == "Party Game"),
    cat_party = as.numeric(replace_na(cat_party, FALSE)),
    
    cat_abstract = if_any(starts_with("cate"), ~ .x == "Abstract Strategy"),
    cat_abstract = as.numeric(replace_na(cat_abstract, FALSE)),
    
    cat_animals = if_any(starts_with("cate"), ~ .x == "Animals"),
    cat_animals = as.numeric(replace_na(cat_animals, FALSE))
  ) %>%
  # flags for mechanic
  step_mutate(
    mech_dice = as.numeric(str_detect(mechanic, "Dice Rolling")),
    mech_hand = as.numeric(str_detect(mechanic, "Hand Management" )),
    mech_powers = as.numeric(str_detect(mechanic, "Variable Player Powers")),
    mech_sets = as.numeric(str_detect(mechanic, "Set Collection")),
    mech_infl = as.numeric(str_detect(mechanic, 
      "Area Control / Area Influence")),
    mech_draft = as.numeric(str_detect(mechanic, "Card Drafting")),
    mech_modular = as.numeric(str_detect(mechanic, "Modular Board")),
    mech_tile = as.numeric(str_detect(mechanic, "Tile Placement")),
    mech_hex = as.numeric(str_detect(mechanic, "Hex-and-Counter")),
    mech_action = as.numeric(str_detect(mechanic, 
      "Action Point Allowance System")),
    mech_coop = as.numeric(str_detect(mechanic, "Co-operative Play")),
    mech_sas = as.numeric(str_detect(mechanic, 
      "Simultaneous Action Selection")),
    mech_auction = as.numeric(str_detect(mechanic, "Auction/Bidding")),
    mech_area = as.numeric(str_detect(mechanic, "Area Movement")),
    mech_worker = as.numeric(str_detect(mechanic, "Worker Placement")),
    mech_grid = as.numeric(str_detect(mechanic, "Grid Movement")),
    mech_simul = as.numeric(str_detect(mechanic, "Simulation")),
    mech_deck = as.numeric(str_detect(mechanic, "Deck / Pool Building")),
    mech_partner = as.numeric(str_detect(mechanic, "Partnerships")),
    mech_point = as.numeric(str_detect(mechanic, "Point to Point Movement")),
    mech_route = as.numeric(str_detect(mechanic, "Route/Network Building"))  
  ) %>%
  # flags for mechanic
  step_mutate(
    design_1 = as.numeric(str_detect(designer, "Reiner Knizia")),
    design_2 = as.numeric(str_detect(designer, "Martin Wallace")),
    design_3 = as.numeric(str_detect(designer, "Wolfgang Kramer")),
    design_4 = as.numeric(str_detect(designer, "Dean Essig")),
    design_5 = as.numeric(str_detect(designer, "Alan R. Moon")),
    design_6 = as.numeric(str_detect(designer, "Bruno Cathala")),
    design_7 = as.numeric(str_detect(designer, "Friedemann Friese")),
    design_8 = as.numeric(str_detect(designer, "Mike Elliott")),
    design_9 = as.numeric(str_detect(designer, "Klaus Teuber")),
    design_10 = as.numeric(str_detect(designer, "Richard H. Berg"))
  ) %>%
  # remove the name, designer, mechanic and category variables
  step_rm(names, mechanic, designer, category1:category12) %>%
  # log transformation
  step_log(num_votes, owned) %>%
  # outliers
  step_mutate(
    min_players = case_when(
      min_players == 0 ~ as.numeric(NA), 
      min_players > 20 ~ 20,
      TRUE ~ min_players
    ),
    max_players = case_when(
      max_players == 0 ~ as.numeric(NA), 
      max_players > 20 ~ 20,
      TRUE ~ max_players
    ),
    min_time = case_when(
      min_time == 0 ~ as.numeric(NA),
      min_time > 240 ~ 240,
      TRUE ~ min_time
    ),
    max_time = case_when(
      max_time == 0 ~ as.numeric(NA),
      max_time > 240 ~ 240,
      TRUE ~ max_time
    ),
    avg_time = case_when(
      avg_time == 0 ~ as.numeric(NA),
      avg_time > 240 ~ 240,
      TRUE ~ avg_time
    ),
    year = case_when(
        year == 0 ~ as.numeric(NA), 
        year < 1900 ~ 1900,
        TRUE ~ year
    )
  ) %>%
  # imputation
  step_impute_mean(min_players, max_players, year, min_time, max_time, avg_time)

Check recipe outputs

prepped <- prep(rec)
baked <- bake(prepped, new_data = NULL) 

skim(baked)  
Data summary
Name baked
Number of rows 2623
Number of columns 59
_______________________
Column type frequency:
numeric 59
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
game_id 0 1 88298.34 76614.03 2.00 10906.00 69601.00 159125.00 244522.00 ▇▂▂▃▂
min_players 0 1 2.02 0.65 1.00 2.00 2.00 2.00 8.00 ▇▁▁▁▁
max_players 0 1 4.70 2.51 1.00 4.00 4.00 6.00 20.00 ▇▆▁▁▁
avg_time 0 1 80.37 65.29 5.00 30.00 60.00 120.00 240.00 ▇▅▂▁▂
min_time 0 1 67.91 59.52 1.00 30.00 45.00 90.00 240.00 ▇▅▁▁▁
max_time 0 1 80.67 65.14 5.00 30.00 60.00 120.00 240.00 ▇▆▂▁▂
year 0 1 2006.41 13.77 1900.00 2003.00 2010.00 2015.00 2018.00 ▁▁▁▁▇
num_votes 0 1 6.62 1.27 4.14 5.63 6.43 7.40 11.26 ▅▇▅▂▁
age 0 1 10.46 3.20 0.00 8.00 12.00 12.00 42.00 ▃▇▁▁▁
owned 0 1 7.24 1.13 3.89 6.42 7.09 7.92 11.62 ▁▇▇▂▁
geek_rating 0 1 6.09 0.49 5.64 5.73 5.90 6.30 8.50 ▇▂▁▁▁
cat_card_game 0 1 0.28 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
cat_wargame 0 1 0.18 0.38 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
cat_fantasy 0 1 0.15 0.36 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
cat_economic 0 1 0.11 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_fighting 0 1 0.11 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_science_fiction 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_dice 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_bluffing 0 1 0.06 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_adventure 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_miniatures 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_ww2 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_medieval 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_explor 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_deduct 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_party 0 1 0.05 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_abstract 0 1 0.05 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_animals 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_dice 0 1 0.28 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
mech_hand 0 1 0.27 0.44 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
mech_powers 0 1 0.18 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
mech_sets 0 1 0.15 0.35 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
mech_infl 0 1 0.13 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_draft 0 1 0.12 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_modular 0 1 0.12 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_tile 0 1 0.11 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_hex 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_action 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_coop 0 1 0.08 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_sas 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_auction 0 1 0.08 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_area 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_worker 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_grid 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_simul 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_deck 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_partner 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_point 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_route 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_1 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_2 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_3 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_4 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_5 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_6 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_7 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_8 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_9 0 1 0.01 0.08 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_10 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁

Specify model

spec <- rand_forest(mtry = 35, min_n = 12, trees = 1500) %>%
  set_mode("regression") %>%
  set_engine("ranger")

Specify workflow

wkflow <- workflow() %>%
  add_recipe(rec) %>%
  add_model(spec)

wkflow  
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
7 Recipe Steps

• step_mutate()
• step_mutate()
• step_mutate()
• step_rm()
• step_log()
• step_mutate()
• step_impute_mean()

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)

Main Arguments:
  mtry = 35
  trees = 1500
  min_n = 12

Computational engine: ranger 

Tune

Tuning happened but won’t be included because it took so long.

set.seed(530)
train_folds <- vfold_cv(train)

set.seed(3621)
ranger_tune <-
  tune_grid(wkflow,
    resamples = train_folds,
    grid = 11
  )

show_best(ranger_tune, metric = "rmse") %>%
  kable( 
    escape = FALSE,
    padding = 5
  ) %>% 
  kable_styling(
    bootstrap_options = c("striped", "hover"), 
    fixed_thead = TRUE, 
    position = "left"
  ) %>%
  row_spec(0, color = "white", background = table_colour)

autoplot(ranger_tune) +
  theme_bw()

Finalise model

The results were:

  • mtry = 35
  • min_n = 12
final_wkflow <- wkflow #%>%
  #finalize_workflow(select_best(ranger_tune, metric = "rmse"))

final_wkflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
7 Recipe Steps

• step_mutate()
• step_mutate()
• step_mutate()
• step_rm()
• step_log()
• step_mutate()
• step_impute_mean()

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)

Main Arguments:
  mtry = 35
  trees = 1500
  min_n = 12

Computational engine: ranger 

Fit model

ranger_fit <- last_fit(final_wkflow, split)

Check performance

collect_metrics(ranger_fit) %>%
  kable( 
    escape = FALSE,
    padding = 5
  ) %>% 
  kable_styling(
    bootstrap_options = c("striped", "hover"), 
    fixed_thead = TRUE, 
    position = "left"
  ) %>%
  row_spec(0, color = "white", background = table_colour)
.metric .estimator .estimate .config
rmse standard 0.1803817 Preprocessor1_Model1
rsq standard 0.8566090 Preprocessor1_Model1

Get predictions

ranger_preds <- collect_predictions(ranger_fit)

ranger_preds %>%
  ggplot(aes(geek_rating, .pred)) +
  geom_abline(lty = 2, colour = pal_sliced[3]) +
  geom_point(alpha = 0.5) +
  coord_fixed() +
  labs(title = 'Predicted vs actual')

Apply to new data

have_scored <- cbind(to_score, predict(ranger_fit$.workflow[[1]], to_score)) %>%
  select(game_id, geek_rating = .pred)

Output submission

write_csv(have_scored, 's01e01/submission.csv')

Outcome

While this was a late submission, the RMSE for my submission was 0.17972, which would put me about 10th on the private leaderboard

Possible improvements

  • Try different model types
  • Is there a better way to create flags for the categories and mechanics - yes using the textrecipes package
banner